ROI Baseball Analytics

Author

Brock Ellis

Show the code
import pandas as pd 
import numpy as np
import sqlite3
from lets_plot import *

LetsPlot.setup_html(isolated_frame=True)
Show the code
sqlite_file = 'lahman_1871-2022.sqlite'
con = sqlite3.connect(sqlite_file)
tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", con)

ROI Baseball Salaries

This section of the analysis will have the purpose of seeing if salary expenditure on teams is related to post season success.

Show the code
# Perform a query that returns the total salary, total wins, total losses, per year and team
base = pd.read_sql_query("""
WITH ps AS (
  -- collect game wins for both winners and losers of each postseason series
  SELECT yearID, teamIDwinner AS teamID, wins   AS gwins FROM SeriesPost
  UNION ALL
  SELECT yearID, teamIDloser  AS teamID, losses AS gwins FROM SeriesPost
),
ps_agg AS (
  SELECT yearID, teamID, SUM(gwins) AS postseason_wins
  FROM ps
  GROUP BY yearID, teamID
),
ws AS (
  SELECT yearID, teamIDwinner AS teamID, 1 AS won_ws
  FROM SeriesPost
  WHERE round = 'WS'
)
SELECT
    t.yearID,
    t.name,
    SUM(s.salary) AS total_salary,
    t.W AS wins,
    t.L AS losses,
    COALESCE(ps_agg.postseason_wins, 0) AS postseason_wins,   -- <-- total PS wins (all rounds)
    COALESCE(ws.won_ws, 0) AS won_world_series                 -- 1 if WS champ, else 0
FROM teams t
JOIN salaries s
  ON t.teamID = s.teamID
 AND t.yearID = s.yearID
LEFT JOIN ps_agg
  ON ps_agg.teamID = t.teamID
 AND ps_agg.yearID = t.yearID
LEFT JOIN ws
  ON ws.teamID = t.teamID
 AND ws.yearID = t.yearID
GROUP BY t.teamID, t.yearID, t.name, t.W, t.L, ps_agg.postseason_wins, ws.won_ws
ORDER BY t.yearID;
""", con)


# Calculate a wins per million dollars expended column
base['wins_per_million'] = base['wins'] / (base['total_salary'] / 1_000_000)

# Change won_world_series to boolean
base['won_world_series'] = base['won_world_series'].astype('bool')

# Convert Salary to millions
base['total_salary'] = base['total_salary'] / 1000000
Show the code
ggplot(base, aes(x="total_salary")) +\
  geom_histogram()
Show the code
ggplot(base, aes(x="wins")) +\
  geom_histogram()
Show the code
ggplot(base, aes(x="won_world_series", y='total_salary', fill='won_world_series')) +\
  geom_boxplot() +\
  scale_y_continuous(limits=(0,200))
Show the code
ggplot(base, aes(x="won_world_series", y='wins', fill='won_world_series')) +\
  geom_boxplot() +\
  scale_y_continuous(limits=(40,120))
Show the code
ggplot(base, aes(x="won_world_series", y='wins_per_million', fill='won_world_series')) +\
  geom_boxplot() +\
  scale_y_continuous(limits=(0,10))
Show the code
ggplot(base, aes(x="wins", y='total_salary', col='won_world_series')) +\
  geom_point() +\
  geom_smooth(method='lm') +\
  scale_y_continuous(limits=(0,200))
Show the code
import statsmodels.formula.api as smf

model_simple = smf.ols(
    "total_salary ~  wins + won_world_series",
    data=base
).fit(cov_type="HC3")

print(model_simple.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           total_salary   R-squared:                       0.065
Model:                            OLS   Adj. R-squared:                  0.063
Method:                 Least Squares   F-statistic:                     34.40
Date:                Sun, 28 Sep 2025   Prob (F-statistic):           3.93e-15
Time:                        15:16:07   Log-Likelihood:                -4730.6
No. Observations:                 918   AIC:                             9467.
Df Residuals:                     915   BIC:                             9482.
Df Model:                           2                                         
Covariance Type:                  HC3                                         
============================================================================================
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                  -13.1447      8.626     -1.524      0.128     -30.052       3.762
won_world_series[T.True]     4.6751      9.626      0.486      0.627     -14.191      23.541
wins                         0.9135      0.114      8.006      0.000       0.690       1.137
==============================================================================
Omnibus:                      137.193   Durbin-Watson:                   0.773
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              212.963
Skew:                           1.000   Prob(JB):                     5.70e-47
Kurtosis:                       4.251   Cond. No.                         584.
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC3)